Importing libraries, defining new functions, and reusing some of the functions I wrote for performing quick analysis

In [1]:
"""Helper Functions for Plotting"""
import numpy as np
import plotly
import plotly.offline as pyoff
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
init_notebook_mode(connected = True)

def generate_layout_bar(col_name):
    """Build a plotly Layout object for a bar chart of the given column.

    Arguments:
    col_name -- column name, used in the chart title

    Returns:
    go.Layout with fixed size, Courier New fonts and a 'Percentage' y axis.
    """
    # shared tick font for both axes
    axis_tick_font = dict(
        family='Courier New, monospace',
        size=14,
        color='black'
    )
    return go.Layout(
        autosize=False,  # fixed size, since width and height are given explicitly
        width=800,   # width of the figure in pixels
        height=600,  # height of the figure in pixels
        title="Distribution of {} column".format(col_name),
        # granular control on the title font
        titlefont=dict(
            family='Courier New, monospace',
            size=14,
            color='black'
        ),
        xaxis=dict(
            tickfont=axis_tick_font
        ),
        yaxis=dict(
            title='Percentage',
            titlefont=dict(
                size=14,
                color='black'
            ),
            tickfont=axis_tick_font
        ),
        # font for the text drawn on the bars themselves
        font=dict(
            family='Courier New, monospace',
            color="white",
            size=12
        )
    )


def plot_count_bar(dataframe_name, col_name, top_n=None):
    """
    Plot a bar chart of value-count percentages for a categorical column.

    Arguments:
    dataframe_name -- pandas DataFrame holding the data
    col_name -- name of the categorical column to plot
    top_n -- if given, keep only the top_n most frequent levels

    Output:
    Renders the plot with iplot.
    """
    # create a table with value counts
    counts = dataframe_name[col_name].value_counts()
    if top_n is not None:
        counts = counts.head(top_n)
    # Percentage per level. Round AFTER scaling by 100 (the original rounded
    # before scaling, which produced floating point artifacts such as
    # '7.000000000000001%' in the bar labels).
    pct = np.round(counts.values.astype(float) / counts.values.sum() * 100, 2)
    data = [go.Bar(
            x=counts.index.astype(str),  # x axis values
            y=pct,  # y axis values (percentages)
            # text displayed on each bar, with an explicit '%' suffix
            text=['{}%'.format(p) for p in pct],
            textposition='auto',  # let plotly pick the text position on the bar
            marker=dict(color='#0047AB'),)]  # Cobalt Blue
    layout_bar = generate_layout_bar(col_name=col_name)
    fig = go.Figure(data=data, layout=layout_bar)
    return iplot(fig)

def plot_bar(dataframe_name, cat_col_name, num_col_name, top_n = 20):
    """
    Plot a bar chart of a numeric column against a categorical column.

    Arguments:
    dataframe_name -- pandas DataFrame holding the data
    cat_col_name -- categorical column used on the x axis
    num_col_name -- numeric column used on the y axis
    top_n -- number of highest-valued rows to keep (default 20)

    Output:
    Renders the plot with iplot.
    """
    # keep only the top_n rows ranked by the numeric column
    ranked = dataframe_name.sort_values(by=num_col_name, ascending=False).head(top_n)
    bar_trace = go.Bar(
        x=ranked[cat_col_name],  # x axis values
        y=ranked[num_col_name],  # y axis values
        # text drawn on each bar; the '%' suffix matches the original display style
        text=['{}%'.format(np.round(v, 2)) for v in ranked[num_col_name]],
        textposition='auto',  # let plotly pick the text position on the bar
        marker=dict(color='#0047AB'),  # Cobalt Blue
    )
    layout_bar = generate_layout_bar(col_name=cat_col_name)
    fig = go.Figure(data=[bar_trace], layout=layout_bar)
    return iplot(fig)


def plot_hist(dataframe, col_name):
    """Plot a plotly histogram of the given column."""
    # HEX color is a single string, so it is easy to pass around as a variable
    trace = go.Histogram(
        x=dataframe[col_name],
        marker=dict(color='#CC0E1D'),  # Lava (#CC0E1D)
    )
    layout = go.Layout(title="Histogram of {}".format(col_name))
    return iplot(go.Figure(data=[trace], layout=layout))


def plot_multi_box(dataframe, col_name, num_col_name):
    """Draw one box plot of num_col_name for each unique level of col_name."""
    # one Box trace per level of the categorical column
    traces = [
        go.Box(y=dataframe.loc[dataframe[col_name] == level, num_col_name],
               name=level)
        for level in dataframe[col_name].unique()
    ]
    layout = go.Layout(
        title="Boxplot of levels in {} for {} column".format(col_name, num_col_name))
    return iplot(go.Figure(data=traces, layout=layout))
In [2]:
import sys
import os
import pandas as pd
from datetime import datetime
In [3]:
data_path = "data/"
In [4]:
def load_data(file_name):
    """Read `<data_path><file_name>.csv` into a pandas DataFrame.

    Relies on the module-level `data_path` variable for the directory prefix.
    """
    csv_path = "{}{}.csv".format(data_path, file_name)
    return pd.read_csv(csv_path)
In [5]:
def convert_to_pandas_datetime(col):
    """Convert a column (or scalar) of date strings to pandas datetime values."""
    converted = pd.to_datetime(col)
    return converted

List all the files in the data path

In [6]:
os.listdir(data_path)
Out[6]:
['notifications.csv',
 'notifications_processed.csv',
 'rev-devices.csv',
 'rev-notifications.csv',
 'rev-transactions.csv',
 'rev-users.csv',
 'transactions_processed.csv',
 'users.csv',
 'users_processed.csv']
In [7]:
devices = load_data("rev-devices")
notifications = load_data("rev-notifications")
transactions = load_data("rev-transactions")
users = load_data("rev-users")
In [8]:
def dataset_info(df):
    """Print the first rows of df and return its row/column counts as a dict."""
    n_rows, n_cols = df.shape
    print("First few rows")
    print(df.head())
    return {"n_rows": n_rows, "n_cols": n_cols}
In [9]:
dataset_info(devices)
First few rows
     brand    user_id
0  Android  user_3257
1    Apple  user_6809
2  Android  user_9144
3    Apple  user_3636
4  Android  user_5970
Out[9]:
{'n_rows': 19430, 'n_cols': 2}
In [10]:
dataset_info(notifications)
First few rows
                      reason channel status    user_id  \
0  REENGAGEMENT_ACTIVE_FUNDS    PUSH   SENT  user_7086   
1  REENGAGEMENT_ACTIVE_FUNDS    PUSH   SENT  user_6598   
2  REENGAGEMENT_ACTIVE_FUNDS    PUSH   SENT  user_4151   
3  REENGAGEMENT_ACTIVE_FUNDS    PUSH   SENT  user_1408   
4  REENGAGEMENT_ACTIVE_FUNDS    PUSH   SENT  user_6292   

                 created_date  
0  2018-12-02 17:58:33.320645  
1  2018-12-01 23:09:37.367127  
2  2018-12-04 02:57:56.425660  
3  2018-12-11 02:04:46.284683  
4  2018-12-14 17:09:58.900808  
Out[10]:
{'n_rows': 121813, 'n_cols': 5}
In [11]:
dataset_info(transactions)
First few rows
  transaction_id transactions_type transactions_currency  amount_usd  \
0  transaction_0          TRANSFER                   AED        4.55   
1  transaction_1      CARD_PAYMENT                   AED       15.50   
2  transaction_2      CARD_PAYMENT                   AED       43.40   
3  transaction_3          TRANSFER                   AED    10043.01   
4  transaction_4      CARD_PAYMENT                   AED       43.81   

  transactions_state ea_cardholderpresence  ea_merchant_mcc ea_merchant_city  \
0          COMPLETED                   NaN              NaN              NaN   
1          COMPLETED                 FALSE           4111.0            Dubai   
2          COMPLETED                 FALSE           5814.0            Dubai   
3          COMPLETED                   NaN              NaN              NaN   
4          COMPLETED                 FALSE           5651.0        Abu Dhabi   

  ea_merchant_country direction    user_id                created_date  
0                 NaN  OUTBOUND   user_898  2018-04-03 03:34:21.784487  
1                 ARE  OUTBOUND  user_1652  2019-03-19 06:15:59.537032  
2                 ARE  OUTBOUND  user_1652  2019-03-18 18:53:41.323032  
3                 NaN  OUTBOUND  user_1652  2019-03-22 14:20:01.513032  
4                 ARE  OUTBOUND  user_5509  2019-03-22 05:42:50.316652  
Out[11]:
{'n_rows': 2740075, 'n_cols': 12}
In [12]:
dataset_info(users)
First few rows
  user_id  birth_year country       city                created_date  \
0  user_0        1989      PL     Gdansk  2018-01-13 05:15:15.599466   
1  user_1        1975      GB     London  2018-01-29 03:38:46.676876   
2  user_2        1987      PL     Poznań  2018-01-18 19:17:31.229096   
3  user_3        1994      FR      Paris  2018-01-15 18:47:56.723104   
4  user_4        1985      GB  Beckenham  2018-01-11 00:36:46.673673   

   user_settings_crypto_unlocked      plan  \
0                              1  STANDARD   
1                              0  STANDARD   
2                              0  STANDARD   
3                              0  STANDARD   
4                              0  STANDARD   

   attributes_notifications_marketing_push  \
0                                      1.0   
1                                      NaN   
2                                      0.0   
3                                      1.0   
4                                      NaN   

   attributes_notifications_marketing_email  num_contacts  num_referrals  \
0                                       1.0             3              0   
1                                       NaN            21              0   
2                                       0.0            21              0   
3                                       0.0             0              0   
4                                       NaN             2              0   

   num_successful_referrals  
0                         0  
1                         0  
2                         0  
3                         0  
4                         0  
Out[12]:
{'n_rows': 19430, 'n_cols': 12}

Exploring the dataset

Checking for the number of unique users

In [13]:
users.user_id.nunique()
Out[13]:
19430

What is the time frame for which we have the data available?

Before we move ahead, let me convert the date to pandas date format for easier manipulation

In [13]:
users.created_date = convert_to_pandas_datetime(users.created_date)

transactions.created_date = convert_to_pandas_datetime(transactions.created_date)

notifications.created_date = convert_to_pandas_datetime(notifications.created_date)
In [17]:
print("We have user registration data for the {} days.".format((users.created_date.max() - users.created_date.min()).days))
We have user registration data for the 366 days.
In [18]:
print("We have user transactions for the {} days.".format((transactions.created_date.max() - transactions.created_date.min()).days))
We have user transactions for the 500 days.
In [19]:
print("We have user notifications for the {} days.".format((notifications.created_date.max() - notifications.created_date.min()).days))
We have user notifications for the 487 days.

Users

We can infer the age of the user from users birth year

In [20]:
users["age"] = 2020 - users.birth_year

Age group of the users

What is the age group of the users on the platform

In [21]:
plot_hist(users,"age")

From the histogram, the bulk of the users are in the age group of 25-35

Most common plan opted ?

In [22]:
temp = pd.DataFrame(users.plan.value_counts())
In [23]:
plot_count_bar(users,"plan")

Over 92% of the users are on the standard plan and the bulk of the users are using STANDARD or PREMIUM plans

In [24]:
users.columns
Out[24]:
Index(['user_id', 'birth_year', 'country', 'city', 'created_date',
       'user_settings_crypto_unlocked', 'plan',
       'attributes_notifications_marketing_push',
       'attributes_notifications_marketing_email', 'num_contacts',
       'num_referrals', 'num_successful_referrals', 'age'],
      dtype='object')

Which country the is user base most concentrated in?

In [25]:
plot_count_bar(users, "country")

32% of the user base is from Great Britain

Where are the users from?

In [26]:
plot_count_bar(users, "city", top_n=10)

I have plotted the top 10 cities here, among the top 10 cities, London has ~42 % of the users and Warszawa comes next with ~10 % users

Is there a specific date where many users signed on?

In [27]:
(users.created_date.value_counts()>1).values
Out[27]:
array([False, False, False, ..., False, False, False])
In [28]:
users.created_date[(users.created_date.value_counts()>1).values]
Out[28]:
Series([], Name: created_date, dtype: datetime64[ns])

No 2 users signed up on the same day, since this is munged data, this might be the case

Percentage of users signed up for push notifications

In [29]:
plot_count_bar(users,"attributes_notifications_marketing_push" )

Only ~5% of the users did not opt for notifications

Percentage of users signed up for marketing emails

In [30]:
plot_count_bar(users,"attributes_notifications_marketing_email" )

~ 90 % of the users signed up for marketing emails

How many users unlocked crypto?

In [31]:
plot_count_bar(users,"user_settings_crypto_unlocked")

only 19% of the users unlocked crypto

Notifications

Most notifications were sent through which channel?

In [32]:
plot_count_bar(notifications, "channel")

Among the 3 notification channels, SMS is the least used mode

In [33]:
notifications.columns
Out[33]:
Index(['reason', 'channel', 'status', 'user_id', 'created_date'], dtype='object')
In [34]:
plot_count_bar(notifications, "status")

~ 27 % of the notifications failed to be delivered

Is a specific mode of notification more prone to fail?

In [35]:
temp = notifications.groupby(["channel", "status"]).size().to_frame().reset_index()
temp.columns = ["channel", "status", "counts"]
temp
Out[35]:
channel status counts
0 EMAIL FAILED 18052
1 EMAIL SENT 43201
2 PUSH FAILED 12397
3 PUSH SENT 45286
4 SMS FAILED 1887
5 SMS SENT 990
In [36]:
failed_x = temp.channel[temp.status=="FAILED"]
failed_y = (temp.counts[temp.status=="FAILED"]/temp.counts.sum())*100
sent_x = temp.channel[temp.status=="SENT"]
sent_y = (temp.counts[temp.status=="SENT"]/temp.counts.sum())*100
fig = go.Figure(data=[
    go.Bar(name='FAILED', x=failed_x, y=failed_y),
    go.Bar(name='SENT', x=sent_x, y=sent_y),
])
# Change the bar mode
fig.update_layout(barmode='group',title = "Percentage of notifications sent and failed (% calculated out of total)")
fig.show()

Most of the failed notifications were in the email mode

In [37]:
plot_count_bar(notifications, "reason")

Most notifications were sent for REENGAGEMENT_ACTIVE_FUNDS (almost 30%)

Devices

Which devices are the customers using?

In [38]:
plot_count_bar(devices,"brand")

The user base is almost equally split between Apple and Android

Transactions

In [39]:
transactions.columns
Out[39]:
Index(['transaction_id', 'transactions_type', 'transactions_currency',
       'amount_usd', 'transactions_state', 'ea_cardholderpresence',
       'ea_merchant_mcc', 'ea_merchant_city', 'ea_merchant_country',
       'direction', 'user_id', 'created_date'],
      dtype='object')

Most common type of transaction?

In [40]:
plot_count_bar(transactions, "transactions_type")

Customers are mostly using the platform for card payment and transfer.

Which currency do the users deal with?

In [41]:
plot_count_bar(transactions, "transactions_currency")

Most customers are dealing with Euros

In [42]:
transactions.columns
Out[42]:
Index(['transaction_id', 'transactions_type', 'transactions_currency',
       'amount_usd', 'transactions_state', 'ea_cardholderpresence',
       'ea_merchant_mcc', 'ea_merchant_city', 'ea_merchant_country',
       'direction', 'user_id', 'created_date'],
      dtype='object')
In [43]:
transactions.transactions_state.value_counts()
Out[43]:
COMPLETED    2407968
DECLINED      155286
REVERTED      112618
FAILED         41053
PENDING        21492
CANCELLED       1658
Name: transactions_state, dtype: int64

What is the probability that a transaction will fail? (irrespective of the type of transaction)

In [44]:
plot_count_bar(transactions,"transactions_state")

There is only a 1.5% probability that a transaction will fail

-This shows that the platform is very stable

Which day did most transactions take place?

In [45]:
transactions.created_date.value_counts()
Out[45]:
2018-12-04 12:13:34.572326    4
2018-12-13 14:49:24.314528    3
2018-12-22 07:24:30.960017    3
2018-10-25 10:06:43.624650    3
2019-05-09 08:48:56.864973    3
2018-08-05 23:29:21.330685    2
2018-12-03 05:19:05.449529    2
2018-10-24 02:47:33.252083    2
2019-02-16 23:07:45.901145    2
2019-01-30 02:13:12.541898    2
2019-04-03 07:33:08.375638    2
2018-09-22 21:53:29.340081    2
2018-09-02 23:55:08.726391    2
2019-01-31 04:30:36.153856    2
2019-03-18 23:21:58.432410    2
2018-11-23 12:47:18.662406    2
2019-01-08 12:06:38.604018    2
2018-06-17 20:03:51.291786    2
2019-01-23 19:41:25.756873    2
2018-06-20 00:25:33.437954    2
2018-11-18 19:08:16.231789    2
2018-12-07 10:45:50.575370    2
2018-09-30 04:32:29.268201    2
2018-09-27 08:08:21.310265    2
2018-11-09 14:38:04.437665    2
2019-05-02 14:39:43.684524    2
2018-11-21 09:30:43.720955    2
2019-04-28 12:43:03.365445    2
2018-10-14 18:52:04.229665    2
2018-09-26 22:12:46.779638    2
                             ..
2019-01-27 18:07:08.571264    1
2018-09-27 12:03:54.547807    1
2019-04-14 14:54:19.074902    1
2018-12-17 12:32:15.023836    1
2018-11-15 18:47:40.377053    1
2018-06-03 09:53:45.067883    1
2018-02-19 04:41:22.517555    1
2019-04-10 06:23:45.655686    1
2018-12-02 03:18:36.301046    1
2018-08-09 16:48:33.022789    1
2018-10-27 23:25:20.861284    1
2018-12-23 04:33:45.028340    1
2019-04-27 00:44:54.410616    1
2019-01-23 22:35:41.038390    1
2018-11-30 06:06:52.149837    1
2019-04-28 16:45:19.688557    1
2018-12-23 07:46:34.799445    1
2018-07-31 09:48:04.804621    1
2018-07-28 01:11:41.910091    1
2018-10-10 18:14:55.281795    1
2019-03-24 12:22:49.405216    1
2018-08-24 01:15:05.001653    1
2018-10-23 03:35:09.913680    1
2018-05-04 17:51:35.087253    1
2019-03-31 12:23:13.140966    1
2018-05-16 22:40:25.951193    1
2018-12-10 18:34:19.787439    1
2018-12-21 05:06:43.857713    1
2018-08-25 16:00:09.533240    1
2018-12-01 06:54:23.232496    1
Name: created_date, Length: 2739658, dtype: int64

The maximum number of transactions at a single timestamp is just 4; let us roll up the dates into weekdays, weeks, months, etc. and check

In [46]:
datetime.strptime('January 11, 2010', '%B %d, %Y').strftime('%a')
Out[46]:
'Mon'
In [47]:
transactions["dayofweek"] = transactions.created_date.apply(lambda x : x.strftime('%a'))

Which week day did most transactions take place?

In [48]:
plot_count_bar(transactions,"dayofweek")

All the days of the week seem to have similar transaction volume

- Not sure if this is due to masked data

Which month had the most number of transactions?

In [49]:
transactions["transactionmonth"] = transactions.created_date.apply(lambda x : x.strftime('%m'))
In [50]:
plot_count_bar(transactions, "transactionmonth")

March and April have the most transactions; there was a dip in the number of transactions in the month of July, followed by a steady rise again

In [51]:
transactions.columns
Out[51]:
Index(['transaction_id', 'transactions_type', 'transactions_currency',
       'amount_usd', 'transactions_state', 'ea_cardholderpresence',
       'ea_merchant_mcc', 'ea_merchant_city', 'ea_merchant_country',
       'direction', 'user_id', 'created_date', 'dayofweek',
       'transactionmonth'],
      dtype='object')

Transaction amount distribution

In [52]:
# plot_multi_box(transactions,"transactions_type", "amount_usd")

There are outliers in the transfer group — single transfers in the hundreds of millions of USD! This makes the plot out of scale and does not help with understanding the other modes of engagement, so I will filter out the very large transactions and continue the analysis for exploratory purposes

Note : Plotting the transactions is slowing down my system, so I will take a sizable sample of the transactions and continue the analysis

In [53]:
transactions_sample = transactions.sample(frac = 0.1)
In [54]:
plot_multi_box(transactions_sample,"transactions_type", "amount_usd")
In [55]:
transactions_sample.groupby("transactions_type").agg({"amount_usd" : [np.median,np.mean,np.min,np.max,np.std]}).reset_index()
Out[55]:
transactions_type amount_usd
median mean amin amax std
0 ATM 42.55 73.062615 1.52 2.127660e+03 9.726127e+01
1 CARD_PAYMENT 7.50 21.404212 0.00 1.948648e+04 9.963976e+01
2 CARD_REFUND 14.00 86.974690 0.01 4.000000e+04 1.150234e+03
3 CASHBACK 0.01 0.090597 0.00 4.243000e+01 6.687181e-01
4 EXCHANGE 17.02 215.687054 0.00 6.127665e+05 4.969917e+03
5 FEE 5.10 8.473084 0.00 1.200000e+02 1.564451e+01
6 REFUND 6.80 598.908114 0.55 6.069898e+04 4.808166e+03
7 TAX 0.10 0.099928 0.07 1.100000e-01 1.895248e-03
8 TOPUP 42.55 180.042426 0.01 1.024719e+05 1.073256e+03
9 TRANSFER 1.80 25346.007437 0.00 8.510645e+08 3.908857e+06

Findings

  • As discussed above, some transfers are in excess of 800 Million USD, while most transfers are around 2 USD — this needs a closer look
  • The standard deviation of the transfers is also the highest, in the order of a few Million USD; this is driven by the extreme values in this mode of transaction

Filtering transactions above 10,000 USD

In [14]:
transactions_filtered = transactions[transactions.amount_usd <= 1e4]
In [15]:
transactions.shape[0] - transactions_filtered.shape[0]
Out[15]:
1133
In [58]:
((transactions.shape[0] - transactions_filtered.shape[0])/transactions.shape[0])*100
Out[58]:
0.04134923314142861

There are 1133 High value transactions in the dataset, above 10,000 USD.

  • This is just ~0.04% of the transaction volume; these instances can be treated separately. If I include these numbers the summary values are highly skewed to the right, so I am filtering them out for the purpose of this analysis.

    These values might be important if we are doing a fraud analysis with the dataset

In [59]:
transactions_filtered.groupby("transactions_type").agg({"amount_usd" : [np.median,np.mean,np.min,np.max,np.std]}).reset_index()
Out[59]:
transactions_type amount_usd
median mean amin amax std
0 ATM 42.55 73.933033 0.36 2553.19 98.164374
1 CARD_PAYMENT 7.54 21.380516 0.00 10000.00 82.414922
2 CARD_REFUND 13.98 61.766813 0.00 8170.22 209.587365
3 CASHBACK 0.01 0.089926 0.00 57.47 0.653553
4 EXCHANGE 17.79 146.754740 0.00 10000.00 566.910775
5 FEE 5.10 8.086473 0.00 120.00 14.683073
6 REFUND 6.80 109.011087 0.07 8747.55 548.015561
7 TAX 0.10 0.099986 0.07 0.11 0.001064
8 TOPUP 42.55 154.387330 0.00 10000.00 454.050802
9 TRANSFER 1.79 57.018991 0.00 10000.00 324.705157

Findings

  • After filtering the high value transactions (above 10,000 USD), the numbers make more sense
  • Bulk of the transactions in ATM and TOPUP mode have a median of 42 USD.
  • Most transfers are less than 2 USD, with a standard deviation of 320USD.
  • Users are exchanging about 18USD
In [17]:
plot_multi_box(transactions_filtered.sample(frac=0.3),"transactions_type", "amount_usd")

The ATM transactions are under 2,000 USD, while card payment, transfer, exchange and topup seem to be going up to 10,000 USD

Was the card holder present when these high value transactions happened?

In [60]:
num_hv_transactions  = transactions.shape[0] - transactions_filtered.shape[0]
In [61]:
def card_holder_presence(x):
    """Map a raw ea_cardholderpresence value to a bool.

    Only the literal string "TRUE" counts as present; "FALSE", NaN and any
    other value map to False (same behavior as the original if/else, written
    as a single equality check).
    """
    return x == "TRUE"
In [62]:
transactions.ea_cardholderpresence = transactions.ea_cardholderpresence.apply(lambda x : card_holder_presence(x))
In [63]:
num_time_user_not_present_hv = np.sum(transactions.ea_cardholderpresence[transactions.amount_usd >1e4])
In [64]:
num_time_user_not_present_hv
Out[64]:
6
In [65]:
num_hv_transactions
Out[65]:
1133
In [66]:
(num_time_user_not_present_hv/num_hv_transactions) *100
Out[66]:
0.529567519858782

In ~0.5% of the high value transactions the cardholder was present (NOTE: the sum counts TRUE values, i.e. rows where the cardholder WAS present, so the variable name `num_time_user_not_present_hv` is misleading — verify before drawing conclusions)

In [67]:
num_time_user_not_present_lv = np.sum(transactions.ea_cardholderpresence[transactions.amount_usd <= 1e4])
In [68]:
num_time_user_not_present_lv
Out[68]:
288483
In [69]:
num_time_user_not_present_lv/transactions.ea_cardholderpresence[transactions.amount_usd  <=1e4].shape[0]
Out[69]:
0.10532643626626632

The cardholder was present in ~10.5% of the low value transactions (again, the sum counts presence despite the variable name suggesting absence)

In [72]:
transactions.to_csv("data/transactions_processed.csv", index = False)
users.to_csv("data/users_processed.csv", index = False)
notifications.to_csv("data/notifications_processed.csv", index= False)